
test_file = '../db/MM_file/train_data/train.txt'  # training corpus
test_file2 = '../db/MM_file/test_data/0.txt' # test corpus
test_file3 = '../db/MM_file/test_sc/0_BMM_sc.txt'  # generated result

# Backward maximum matching (BMM) word segmentation
def get_dic(test_file):
    """Collect the distinct whitespace-separated words of a text file.

    Args:
        test_file: Path to a UTF-8 encoded text file whose words are
            separated by whitespace (spaces, tabs or newlines).

    Returns:
        A list containing each distinct word once. The order is arbitrary
        because duplicates are removed through a set.
    """
    # The with-statement already closes the file, even on error, so no
    # explicit close() is needed.
    with open(test_file, 'r', encoding='utf-8') as f:
        words = f.read().split()
    return list(set(words))


# Hold the dictionary as a set: it is only used for membership tests, which
# are O(1) on a set but O(n) on the list that get_dic returns.
dic = set(get_dic(test_file))


def _bmm_segment(line, dictionary, max_length):
    """Split one line into tokens using backward maximum matching.

    Starting from the end of the line, take the longest suffix of at most
    ``max_length`` characters that is in ``dictionary``; if none matches,
    fall back to the single last character. Repeat on the remaining prefix.

    Returns the tokens in their original left-to-right order.
    """
    tokens = []
    end = len(line)
    while end > 0:
        start = max(0, end - max_length)
        # Shrink the candidate from the left until it is a known word or
        # only one character is left (which is always accepted).
        while end - start > 1 and line[start:end] not in dictionary:
            start += 1
        tokens.append(line[start:end])
        end = start
    # Tokens were collected right-to-left; restore reading order.
    tokens.reverse()
    return tokens


def readfile(test_file2, out_file=None, dictionary=None, max_length=5):
    """Segment a text file with backward maximum matching and save the result.

    Every token is written followed by two spaces, except a newline
    character, which is written as-is so the output keeps the line
    structure of the input.

    Args:
        test_file2: Path of the UTF-8 text file to segment.
        out_file: Path of the UTF-8 output file. Defaults to the
            module-level ``test_file3``.
        dictionary: Collection of known words. Defaults to the module-level
            ``dic``.
        max_length: Maximum number of characters tried for one word.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")
    if out_file is None:
        out_file = test_file3
    if dictionary is None:
        dictionary = dic
    # Membership is tested several times per token; a set makes each test
    # O(1) instead of a linear scan when a list is supplied.
    if not isinstance(dictionary, (set, frozenset)):
        dictionary = set(dictionary)

    # Read the input first so a missing input file does not truncate an
    # existing output file.
    with open(test_file2, 'r', encoding='utf-8') as src:
        lines = src.readlines()

    # The with-statement guarantees the output is closed even if
    # segmentation or writing fails.
    with open(out_file, 'w', encoding='utf-8') as dst:
        for line in lines:
            tokens = _bmm_segment(line, dictionary, max_length)
            dst.write(''.join(
                tok if tok == '\n' else tok + "  " for tok in tokens
            ))


# Segment the test corpus as soon as this module is executed.
readfile(test_file2)
